perms$T.upper <- T.upper[match(perms$T.lower, T.lower)]     # Repeat for T
#Sample 1 experiment from each of the PT brackets
sam <- lapply(1:nrow(perms), function(x) which(input$P < perms$P.upper[x] & input$P >= perms$P.lower[x] & input$T < perms$T.upper[x] & input$T >= perms$T.lower[x])) #Creates each grid, which are defined by df perms
sam[sapply(sam, function(sam) length(sam)==0)] <- NA        # For sam if the length is 0 set as NA
samp <- lapply(1:length(sam), function(x) sample(sam[[x]], size = 1)) # Samples points from each gridspace
perms$samp <- unlist(samp) #Add sample to perms
#Find the number of experiments in each bracket
perms$n <- unlist(lapply(1:length(sam), function(x) length(which(is.na(sam[[x]]) == F))))
#Remove brackets from perm which don't have any experiments (or only have a single experiment)
no.perms <- which(is.na(perms$samp) | perms$n < 2)
perms <- perms[-no.perms,]
#N.test is the number of unique values of the perms id
test.ids[[i]] <- perms$samp
#End loop
}
##### FIND THE ID OF THE TRAIN IDS ####
train.ids <- lapply(1:length(test.ids), function(x) {sample(which(1:nrow(input) %!in% test.ids[[x]]), size = (nrow(input) - length(test.ids[[x]])), replace = F)})
#### SAVE TEST AND TRAIN IDS ####
save(test.ids, file = "testids.Rdata")
save(train.ids, file = "trainids.Rdata")
######################
#### DETERMINE SEE####
######################
#we recommend cleaning the global environment before running this code, sometimes if you don't you will get an error with the "times" argument at the end
####INSTALL REQUIRED PACKAGES IF NECESSARY
options(java.parameters = "-Xmx4g")
pack1 <- suppressWarnings(require(PerformanceAnalytics))
if(pack1 == FALSE) {install.packages("PerformanceAnalytics");library(PerformanceAnalytics)}
pack2 <- suppressWarnings(require(rJava))
if(pack2 == FALSE) {install.packages("rJava");library(rJava)}
pack3 <- suppressWarnings(require(extraTrees))
if(pack3 == FALSE) {install.packages("extraTrees");library(extraTrees)}
pack4 <- suppressWarnings(require(EnvStats))
if(pack4 == FALSE) {install.packages("EnvStats");library(EnvStats)}
pack5 <- suppressWarnings(require(xlsx))
if(pack5 == FALSE) {install.packages("xlsx");library(xlsx)}
pack6 <- suppressWarnings(require(readxl))
if(pack6 == FALSE) {install.packages("readxl");library(readxl)}
pack7 <- suppressWarnings(require(rstudioapi))
if(pack7 == FALSE) {install.packages("rstudioapi");library(rstudioapi)}
rm(pack1,pack2,pack3,pack4,pack5,pack6, pack7)
setwd(paste(dirname(rstudioapi::getActiveDocumentContext()$path)))
options(scipen = 999)
'%!in%' <- function(x,y)!('%in%'(x,y))
######### READ DATA AND REMOVE FILTERED EXPERIMENTS #######
#Load raw data
load("input.Rdata")
##### LOAD IN TEST AND TRAIN IDS
load("testids.Rdata")
load("trainids.Rdata")
#### What do you have? ####
#YOU DECIDE#
liq <- c("NoLiquid") #OR Liquid
#Set the oxides
ox <- c("SiO2", "TiO2","Al2O3", "Cr2O3","FeO","MgO", "MnO", "CaO", "Na2O")
liqox <- c("SiO2", "TiO2","Al2O3", "FeO","MgO", "MnO", "CaO", "Na2O","K2O")
var <- paste0(ox,".cpx") #adds .cpx to the cpx oxides
if(liq == "Liquid") {var <- c(var, paste0(liqox,".liq"))} #adds .liq to the liquid variables
#Select hyperparamters
r <- 200                #Number of test/train splits
n.cuts <- 1             #number of random cuts
n.tree <- 201           #number of trees
m.try <-length(var)*2/3 #this should be 12 for liquid and 6 for no liquid.
#PRESSURE
#Make empty lists to populate
Pred_P_mean <- list(); Pred_P_median <- list(); Pred_P_mode <- list()
Pred_P_all <- list(); Resid_P_mean <- list(); Resid_P_median <- list(); Resid_P_mode <- list()
SEE_P_mean <- list(); SEE_P_median <- list(); SEE_P_mode <- list()
R2_P_mean <- list(); R2_P_median <- list(); R2_P_mode <- list(); model_P <- list()
P_IQR <- list()
for(j in 1:r) {
#Train the pressure model
model_P <- extraTrees(x = input[train.ids[[j]], var], y = input$P[train.ids[[j]]], ntree = n.tree, mtry = m.try, numRandomCuts = n.cuts, numThreads = 8) ; print(j)
#Add pause
# Sys.sleep(time = 0.1)
#Predict the pressure in the testset by taking the mean value
Pred_P_mean[[j]] <- round(apply(predict(model_P, newdata = input[test.ids[[j]], var], allValues = T), 1, mean), digits = 1)
#Predict the pressure in the testset by taking the median value
Pred_P_median[[j]] <- round(apply(predict(model_P, newdata = input[test.ids[[j]], var], allValues = T), 1, median), digits = 1)
#Predict the pressure in the testset by taking the modal value
Pred_P_all[[j]] <- round(predict(model_P, newdata = input[test.ids[[j]], var], allValues = T), digits = 1)
Pred_P_tab <- apply(round(predict(model_P, newdata = input[test.ids[[j]], var], allValues = T), digits = 1), 1, table)
Pred_P_mode[[j]] <- unlist(lapply(Pred_P_tab, function(x) {as.numeric(names(sort(x, decreasing = T)[1]))}))
#Calculate the IQR of the voting distribution
P_IQR[[j]] <- round(apply(predict(model_P, newdata = input[test.ids[[j]], var], allValues = T), 1, IQR), digits = 1)
#Calculate the pressure residual from each of the models
Resid_P_mean[[j]] <- input$P[test.ids[[j]]]-unlist(Pred_P_mean[[j]])
Resid_P_median[[j]] <- input$P[test.ids[[j]]]-unlist(Pred_P_median[[j]])
Resid_P_mode[[j]] <- input$P[test.ids[[j]]]-unlist(Pred_P_mode[[j]])
#Calculate the pressure SEE for each study
SEE_P_mean[[j]] <- round((sum((Pred_P_mean[[j]]-input$P[test.ids[[j]]])^2)/length(input$P[test.ids[[j]]]))^(0.5), digits = 2)
SEE_P_median[[j]] <- round((sum((Pred_P_median[[j]]-input$P[test.ids[[j]]])^2)/length(input$P[test.ids[[j]]]))^(0.5), digits = 2)
SEE_P_mode[[j]] <- round((sum((Pred_P_mode[[j]]-input$P[test.ids[[j]]])^2)/length(input$P[test.ids[[j]]]))^(0.5), digits = 2)
#Calculte the pressure R for each study
R2_P_mean[[j]] <- round(summary(lm(Pred_P_mean[[j]]~input$P[test.ids[[j]]]))$r.squared, digits = 3)
R2_P_median[[j]] <- round(summary(lm(Pred_P_median[[j]]~input$P[test.ids[[j]]]))$r.squared, digits = 3)
R2_P_mode[[j]] <- round(summary(lm(Pred_P_mode[[j]]~input$P[test.ids[[j]]]))$r.squared, digits = 3)
#Add pause
# Sys.sleep(time = 0.1)
rm(model_P)
}
#TEMPERATURE
#Make empty lists to populate
Pred_T_mean <- list(); Pred_T_median <- list(); Pred_T_mode <- list()
Pred_T_all <- list(); Resid_T_mean <- list(); Resid_T_median <- list(); Resid_T_mode <- list()
SEE_T_mean <- list(); SEE_T_median <- list(); SEE_T_mode <- list()
R2_T_mean <- list(); R2_T_median <- list(); R2_T_mode <- list(); model_T <- list()
T_IQR <- list()
for(j in 1:r) {
#Train the pressure model
model_T <- extraTrees(x = input[train.ids[[j]], var], y = input$T[train.ids[[j]]], ntree = n.tree, mtry = m.try, numRandomCuts = n.cuts, numThreads = 8) ; print(j)
#Add pause
# Sys.sleep(time = 0.1)
#Predict the pressure in the testset by taking the mean value
Pred_T_mean[[j]] <- round(apply(predict(model_T, newdata = input[test.ids[[j]], var], allValues = T), 1, mean), digits = 0)
#Predict the pressure in the testset by taking the median value
Pred_T_median[[j]] <- round(apply(predict(model_T, newdata = input[test.ids[[j]], var], allValues = T), 1, median), digits = 0)
#Predict the pressure in the testset by taking the modal value
Pred_T_all[[j]] <- round(predict(model_T, newdata = input[test.ids[[j]], var], allValues = T), digits = 0)
Pred_T_tab <- apply(round(predict(model_T, newdata = input[test.ids[[j]], var], allValues = T), digits = 0), 1, table)
Pred_T_mode[[j]] <- unlist(lapply(Pred_T_tab, function(x) {as.numeric(names(sort(x, decreasing = T)[1]))}))
#Calculate the IQR of the voting distribution
T_IQR[[j]] <- round(apply(predict(model_T, newdata = input[test.ids[[j]], var], allValues = T), 1, IQR), digits = 0)
#Calculate the pressure residual from each of the models
Resid_T_mean[[j]] <- input$T[test.ids[[j]]]-unlist(Pred_T_mean[[j]])
Resid_T_median[[j]] <- input$T[test.ids[[j]]]-unlist(Pred_T_median[[j]])
Resid_T_mode[[j]] <- input$T[test.ids[[j]]]-unlist(Pred_T_mode[[j]])
#Calculate the pressure SEE for each study
SEE_T_mean[[j]] <- round((sum((Pred_T_mean[[j]]-input$T[test.ids[[j]]])^2)/length(input$T[test.ids[[j]]]))^(0.5), digits = 0)
SEE_T_median[[j]] <- round((sum((Pred_T_median[[j]]-input$T[test.ids[[j]]])^2)/length(input$T[test.ids[[j]]]))^(0.5), digits = 0)
SEE_T_mode[[j]] <- round((sum((Pred_T_mode[[j]]-input$T[test.ids[[j]]])^2)/length(input$T[test.ids[[j]]]))^(0.5), digits = 0)
#Calculte the pressure R for each study
R2_T_mean[[j]] <- round(summary(lm(Pred_T_mean[[j]]~input$T[test.ids[[j]]]))$r.squared, digits = 3)
R2_T_median[[j]] <- round(summary(lm(Pred_T_median[[j]]~input$T[test.ids[[j]]]))$r.squared, digits = 3)
R2_T_mode[[j]] <- round(summary(lm(Pred_T_mode[[j]]~input$T[test.ids[[j]]]))$r.squared, digits = 3)
#Add pause
# Sys.sleep(time = 0.1)
rm(model_T)
}
#### SAVE CATION DATA #####
output <- input[unlist(test.ids),]
output$r <- rep(1:r, times = lapply(test.ids, length))
#Add pressure data
output$Pred.P.mean <- unlist(Pred_P_mean)
output$Pred.P.med <- unlist(Pred_P_median)
output$Pred.P.mode <- unlist(Pred_P_mode)
output$Resid.P.mean <- unlist(Resid_P_mean)
output$Resid.P.med <- unlist(Resid_P_median)
output$Resid.P.mode <- unlist(Resid_P_mode)
output$SEE.P.mean <- rep(unlist(SEE_P_mean), times = lapply(test.ids, length))
output$SEE.P.med <- rep(unlist(SEE_P_median), times = lapply(test.ids, length))
output$SEE.P.mode <- rep(unlist(SEE_P_mode), times = lapply(test.ids, length))
output$R2.P.mean <- rep(unlist(R2_P_mean), times = lapply(test.ids, length))
output$R2.P.med <- rep(unlist(R2_P_median), times = lapply(test.ids, length))
output$R2.P.mode <- rep(unlist(R2_P_mode), times = lapply(test.ids, length))
#Add temperature data
output$Pred.T.mean <- unlist(Pred_T_mean)
output$Pred.T.med <- unlist(Pred_T_median)
output$Pred.T.mode <- unlist(Pred_T_mode)
output$Resid.T.mean <- unlist(Resid_T_mean)
output$Resid.T.med <- unlist(Resid_T_median)
output$Resid.T.mode <- unlist(Resid_T_mode)
output$SEE.T.mean <- rep(unlist(SEE_T_mean), times = lapply(test.ids, length))
output$SEE.T.med <- rep(unlist(SEE_T_median), times = lapply(test.ids, length))
output$SEE.T.mode <- rep(unlist(SEE_T_mode), times = lapply(test.ids, length))
output$R2.T.mean <- rep(unlist(R2_T_mean), times = lapply(test.ids, length))
output$R2.T.med <- rep(unlist(R2_T_median), times = lapply(test.ids, length))
output$R2.T.mode <- rep(unlist(R2_T_mode), times = lapply(test.ids, length))
##### SAVE AND EXPORT ####
final <- output
save(final, file = "final.Rdata")
write.csv(final,'final.csv')
#### print the SEE ####
mean(unlist(SEE_P_mean))
mean(unlist(SEE_T_mean))
######################
#### DETERMINE SEE####
######################
#we recommend cleaning the global environment before running this code, sometimes if you don't you will get an error with the "times" argument at the end
####INSTALL REQUIRED PACKAGES IF NECESSARY
options(java.parameters = "-Xmx4g")
pack1 <- suppressWarnings(require(PerformanceAnalytics))
if(pack1 == FALSE) {install.packages("PerformanceAnalytics");library(PerformanceAnalytics)}
pack2 <- suppressWarnings(require(rJava))
if(pack2 == FALSE) {install.packages("rJava");library(rJava)}
pack3 <- suppressWarnings(require(extraTrees))
if(pack3 == FALSE) {install.packages("extraTrees");library(extraTrees)}
pack4 <- suppressWarnings(require(EnvStats))
if(pack4 == FALSE) {install.packages("EnvStats");library(EnvStats)}
pack5 <- suppressWarnings(require(xlsx))
if(pack5 == FALSE) {install.packages("xlsx");library(xlsx)}
pack6 <- suppressWarnings(require(readxl))
if(pack6 == FALSE) {install.packages("readxl");library(readxl)}
pack7 <- suppressWarnings(require(rstudioapi))
if(pack7 == FALSE) {install.packages("rstudioapi");library(rstudioapi)}
rm(pack1,pack2,pack3,pack4,pack5,pack6, pack7)
setwd(paste(dirname(rstudioapi::getActiveDocumentContext()$path)))
options(scipen = 999)
'%!in%' <- function(x,y)!('%in%'(x,y))
######### READ DATA AND REMOVE FILTERED EXPERIMENTS #######
#Load raw data
load("input.Rdata")
##### LOAD IN TEST AND TRAIN IDS
load("testids.Rdata")
load("trainids.Rdata")
#### What do you have? ####
#YOU DECIDE#
liq <- c("NoLiquid") #OR Liquid
#Set the oxides
ox <- c("SiO2", "TiO2","Al2O3", "Cr2O3","FeO","MgO", "MnO", "CaO", "Na2O")
liqox <- c("SiO2", "TiO2","Al2O3", "FeO","MgO", "MnO", "CaO", "Na2O","K2O")
var <- paste0(ox,".cpx") #adds .cpx to the cpx oxides
if(liq == "Liquid") {var <- c(var, paste0(liqox,".liq"))} #adds .liq to the liquid variables
#Select hyperparamters
r <- 200                #Number of test/train splits
n.cuts <- 1             #number of random cuts
n.tree <- 201           #number of trees
m.try <-length(var)*2/3 #this should be 12 for liquid and 6 for no liquid.
#PRESSURE
#Make empty lists to populate
Pred_P_mean <- list(); Pred_P_median <- list(); Pred_P_mode <- list()
Pred_P_all <- list(); Resid_P_mean <- list(); Resid_P_median <- list(); Resid_P_mode <- list()
SEE_P_mean <- list(); SEE_P_median <- list(); SEE_P_mode <- list()
R2_P_mean <- list(); R2_P_median <- list(); R2_P_mode <- list(); model_P <- list()
P_IQR <- list()
for(j in 1:r) {
#Train the pressure model
model_P <- extraTrees(x = input[train.ids[[j]], var], y = input$P[train.ids[[j]]], ntree = n.tree, mtry = m.try, numRandomCuts = n.cuts, numThreads = 8) ; print(j)
#Add pause
# Sys.sleep(time = 0.1)
#Predict the pressure in the testset by taking the mean value
Pred_P_mean[[j]] <- round(apply(predict(model_P, newdata = input[test.ids[[j]], var], allValues = T), 1, mean), digits = 1)
#Predict the pressure in the testset by taking the median value
Pred_P_median[[j]] <- round(apply(predict(model_P, newdata = input[test.ids[[j]], var], allValues = T), 1, median), digits = 1)
#Predict the pressure in the testset by taking the modal value
Pred_P_all[[j]] <- round(predict(model_P, newdata = input[test.ids[[j]], var], allValues = T), digits = 1)
Pred_P_tab <- apply(round(predict(model_P, newdata = input[test.ids[[j]], var], allValues = T), digits = 1), 1, table)
Pred_P_mode[[j]] <- unlist(lapply(Pred_P_tab, function(x) {as.numeric(names(sort(x, decreasing = T)[1]))}))
#Calculate the IQR of the voting distribution
P_IQR[[j]] <- round(apply(predict(model_P, newdata = input[test.ids[[j]], var], allValues = T), 1, IQR), digits = 1)
#Calculate the pressure residual from each of the models
Resid_P_mean[[j]] <- input$P[test.ids[[j]]]-unlist(Pred_P_mean[[j]])
Resid_P_median[[j]] <- input$P[test.ids[[j]]]-unlist(Pred_P_median[[j]])
Resid_P_mode[[j]] <- input$P[test.ids[[j]]]-unlist(Pred_P_mode[[j]])
#Calculate the pressure SEE for each study
SEE_P_mean[[j]] <- round((sum((Pred_P_mean[[j]]-input$P[test.ids[[j]]])^2)/length(input$P[test.ids[[j]]]))^(0.5), digits = 2)
SEE_P_median[[j]] <- round((sum((Pred_P_median[[j]]-input$P[test.ids[[j]]])^2)/length(input$P[test.ids[[j]]]))^(0.5), digits = 2)
SEE_P_mode[[j]] <- round((sum((Pred_P_mode[[j]]-input$P[test.ids[[j]]])^2)/length(input$P[test.ids[[j]]]))^(0.5), digits = 2)
#Calculte the pressure R for each study
R2_P_mean[[j]] <- round(summary(lm(Pred_P_mean[[j]]~input$P[test.ids[[j]]]))$r.squared, digits = 3)
R2_P_median[[j]] <- round(summary(lm(Pred_P_median[[j]]~input$P[test.ids[[j]]]))$r.squared, digits = 3)
R2_P_mode[[j]] <- round(summary(lm(Pred_P_mode[[j]]~input$P[test.ids[[j]]]))$r.squared, digits = 3)
#Add pause
# Sys.sleep(time = 0.1)
rm(model_P)
}
#TEMPERATURE
#Make empty lists to populate
Pred_T_mean <- list(); Pred_T_median <- list(); Pred_T_mode <- list()
Pred_T_all <- list(); Resid_T_mean <- list(); Resid_T_median <- list(); Resid_T_mode <- list()
SEE_T_mean <- list(); SEE_T_median <- list(); SEE_T_mode <- list()
R2_T_mean <- list(); R2_T_median <- list(); R2_T_mode <- list(); model_T <- list()
T_IQR <- list()
for(j in 1:r) {
#Train the pressure model
model_T <- extraTrees(x = input[train.ids[[j]], var], y = input$T[train.ids[[j]]], ntree = n.tree, mtry = m.try, numRandomCuts = n.cuts, numThreads = 8) ; print(j)
#Add pause
# Sys.sleep(time = 0.1)
#Predict the pressure in the testset by taking the mean value
Pred_T_mean[[j]] <- round(apply(predict(model_T, newdata = input[test.ids[[j]], var], allValues = T), 1, mean), digits = 0)
#Predict the pressure in the testset by taking the median value
Pred_T_median[[j]] <- round(apply(predict(model_T, newdata = input[test.ids[[j]], var], allValues = T), 1, median), digits = 0)
#Predict the pressure in the testset by taking the modal value
Pred_T_all[[j]] <- round(predict(model_T, newdata = input[test.ids[[j]], var], allValues = T), digits = 0)
Pred_T_tab <- apply(round(predict(model_T, newdata = input[test.ids[[j]], var], allValues = T), digits = 0), 1, table)
Pred_T_mode[[j]] <- unlist(lapply(Pred_T_tab, function(x) {as.numeric(names(sort(x, decreasing = T)[1]))}))
#Calculate the IQR of the voting distribution
T_IQR[[j]] <- round(apply(predict(model_T, newdata = input[test.ids[[j]], var], allValues = T), 1, IQR), digits = 0)
#Calculate the pressure residual from each of the models
Resid_T_mean[[j]] <- input$T[test.ids[[j]]]-unlist(Pred_T_mean[[j]])
Resid_T_median[[j]] <- input$T[test.ids[[j]]]-unlist(Pred_T_median[[j]])
Resid_T_mode[[j]] <- input$T[test.ids[[j]]]-unlist(Pred_T_mode[[j]])
#Calculate the pressure SEE for each study
SEE_T_mean[[j]] <- round((sum((Pred_T_mean[[j]]-input$T[test.ids[[j]]])^2)/length(input$T[test.ids[[j]]]))^(0.5), digits = 0)
SEE_T_median[[j]] <- round((sum((Pred_T_median[[j]]-input$T[test.ids[[j]]])^2)/length(input$T[test.ids[[j]]]))^(0.5), digits = 0)
SEE_T_mode[[j]] <- round((sum((Pred_T_mode[[j]]-input$T[test.ids[[j]]])^2)/length(input$T[test.ids[[j]]]))^(0.5), digits = 0)
#Calculte the pressure R for each study
R2_T_mean[[j]] <- round(summary(lm(Pred_T_mean[[j]]~input$T[test.ids[[j]]]))$r.squared, digits = 3)
R2_T_median[[j]] <- round(summary(lm(Pred_T_median[[j]]~input$T[test.ids[[j]]]))$r.squared, digits = 3)
R2_T_mode[[j]] <- round(summary(lm(Pred_T_mode[[j]]~input$T[test.ids[[j]]]))$r.squared, digits = 3)
#Add pause
# Sys.sleep(time = 0.1)
rm(model_T)
}
#### SAVE CATION DATA #####
output <- input[unlist(test.ids),]
output$r <- rep(1:r, times = lapply(test.ids, length))
#Add pressure data
output$Pred.P.mean <- unlist(Pred_P_mean)
output$Pred.P.med <- unlist(Pred_P_median)
output$Pred.P.mode <- unlist(Pred_P_mode)
output$Resid.P.mean <- unlist(Resid_P_mean)
output$Resid.P.med <- unlist(Resid_P_median)
output$Resid.P.mode <- unlist(Resid_P_mode)
output$SEE.P.mean <- rep(unlist(SEE_P_mean), times = lapply(test.ids, length))
output$SEE.P.med <- rep(unlist(SEE_P_median), times = lapply(test.ids, length))
output$SEE.P.mode <- rep(unlist(SEE_P_mode), times = lapply(test.ids, length))
output$R2.P.mean <- rep(unlist(R2_P_mean), times = lapply(test.ids, length))
output$R2.P.med <- rep(unlist(R2_P_median), times = lapply(test.ids, length))
output$R2.P.mode <- rep(unlist(R2_P_mode), times = lapply(test.ids, length))
#Add temperature data
output$Pred.T.mean <- unlist(Pred_T_mean)
output$Pred.T.med <- unlist(Pred_T_median)
output$Pred.T.mode <- unlist(Pred_T_mode)
output$Resid.T.mean <- unlist(Resid_T_mean)
output$Resid.T.med <- unlist(Resid_T_median)
output$Resid.T.mode <- unlist(Resid_T_mode)
output$SEE.T.mean <- rep(unlist(SEE_T_mean), times = lapply(test.ids, length))
output$SEE.T.med <- rep(unlist(SEE_T_median), times = lapply(test.ids, length))
output$SEE.T.mode <- rep(unlist(SEE_T_mode), times = lapply(test.ids, length))
output$R2.T.mean <- rep(unlist(R2_T_mean), times = lapply(test.ids, length))
output$R2.T.med <- rep(unlist(R2_T_median), times = lapply(test.ids, length))
output$R2.T.mode <- rep(unlist(R2_T_mode), times = lapply(test.ids, length))
##### SAVE AND EXPORT ####
final <- output
save(final, file = "final.Rdata")
write.csv(final,'final.csv')
#### print the SEE ####
mean(unlist(SEE_P_mean))
mean(unlist(SEE_T_mean))
###############################
#### MAKE THE ACTUAL MODEL ####
###############################
#### INSTALL REQUIRED PACKAGES IF NECESSARY
options(java.parameters = "-Xmx4g")
pack1 <- suppressWarnings(require(PerformanceAnalytics))
if(pack1 == FALSE) {install.packages("PerformanceAnalytics");library(PerformanceAnalytics)}
pack2 <- suppressWarnings(require(rJava))
if(pack2 == FALSE) {install.packages("rJava");library(rJava)}
pack3 <- suppressWarnings(require(extraTrees))
if(pack3 == FALSE) {install.packages("extraTrees");library(extraTrees)}
pack4 <- suppressWarnings(require(EnvStats))
if(pack4 == FALSE) {install.packages("EnvStats");library(EnvStats)}
pack5 <- suppressWarnings(require(xlsx))
if(pack5 == FALSE) {install.packages("xlsx");library(xlsx)}
pack6 <- suppressWarnings(require(readxl))
if(pack6 == FALSE) {install.packages("readxl");library(readxl)}
pack7 <- suppressWarnings(require(rstudioapi))
if(pack7 == FALSE) {install.packages("rstudioapi");library(rstudioapi)}
rm(pack1,pack2,pack3,pack4,pack5,pack6, pack7)
setwd(paste(dirname(rstudioapi::getActiveDocumentContext()$path)))
options(scipen = 999)
'%!in%' <- function(x,y)!('%in%'(x,y))
######### READ DATA AND REMOVE FILTERED EXPERIMENTS #######
#Load raw data
load("input.Rdata")
#### What do you have? ####
#SAME AS #4 or else the SEE you calculated is incorrect!
liq <- c("NoLiquid") #OR Liquid
#Set the oxides
ox <- c("SiO2", "TiO2","Al2O3", "Cr2O3","FeO","MgO", "MnO", "CaO", "Na2O") #Make sure this order stays the same!!
liqox <- c("SiO2", "TiO2","Al2O3", "FeO","MgO", "MnO", "CaO", "Na2O","K2O")
var <- paste0(ox,".cpx") #adds .cpx to the cpx oxides
if(liq == "Liquid") {var <- c(var, paste0(liqox,".liq"))} #adds .liq to the liquid variables
#Hyperparamters
r <- 200                #Number of test/train splits
n.cuts <- 1             #number of random cuts
n.tree <- 201           #number of trees
m.try <-length(var)*2/3 #this should be 12 for liquid and 6 for no liquid.
#Train pressure model
P_C <- extraTrees(x = input[, var], y = input$P, ntree = n.tree, mtry = m.try, numRandomCuts = n.cuts, numThreads = 8)
#Train temperature model
T_C <- extraTrees(x = input[, var], y = input$T, ntree = n.tree, mtry = m.try, numRandomCuts = n.cuts, numThreads = 8)
#Save models
prepareForSave(P_C)
save(P_C, file = "P_C.Rdata")
prepareForSave(T_C)
save(T_C, file = "T_C.Rdata")
################################
##PREPROCESS/FILTER YOUR DATA###
################################
#Same as script 1 and 2 but for your data
#####################
#### INSTALL REQUIRED PACKAGES IF NECESSARY
pack1 <- suppressWarnings(require(readxl))
if(pack1 == FALSE) {install.packages("readxl")}
setwd(paste(dirname(rstudioapi::getActiveDocumentContext()$path)))
#### What do you have? ####
#SAME AS #4 or else the SEE you calculated is incorrect!
liq <- c("NoLiquid") #OR Liquid
###### LOAD YOUR DATA #####
load("OxiWeight.Rdata")
userdat <- read.csv("InputData.csv")
userdat[is.na(userdat)] <- 0
##### CALCULATE MOLAR AND CLINOPYROXENE SPECIFIC SITE CHEMISTRY #####
#Choose oxides
ox <- c("SiO2", "Al2O3", "TiO2", "CaO", "Na2O", "FeO", "MgO", "MnO", "Cr2O3")
#Extract columns that are only oxides
input <- userdat[,paste0(ox, ".cpx")]
colnames(input) <- ox
#The number of oxygens to calculate on the basis of which depends upon the mineral formula you are calculating
OxNum <- 6
#The names of all oxides inside oxiweight
all.ox <- rownames(OxiWeight)
#Take each oxide and divide by its appropriate atomic weight as taken from the data OxiWeight
molprop <- apply(input, MARGIN = 1, function(x) x / round(OxiWeight[ox, 'OWeight'], digits = 2))
#This is the number of oxygens in the oxide formula for each of the oxides.
Ox.frame <- data.frame(all.ox, c(2,2,3,3,1,1,1,1,1,1,5,2,3,1,1,1,1,1,1,1,2,3,3,0,0,0,1))
colnames(Ox.frame) <- c("Oxide", "N")
#This is the numbers from Ox.frame relevant to the raw data inputted
Ox <- Ox.frame[match(ox, Ox.frame$Oxide), "N"]
#calculate the atomic proportion of oxygen. This is the molar proportion multiplied by the number of oxygens in the oxide
AtPropOx <- apply(molprop, MARGIN = 2, function(x) x * Ox)
#Sum the atomic proportion of oxygen to give the total atomic proportion of oxygen
TotalOx <- apply(AtPropOx, MARGIN = 2, function(x) sum(x))
#Calculate the value to use in the recalculation
OxRecalc <- OxNum / TotalOx
#This is the number used to multiply by for the cations in the final step. This is essentially the number of cations
#per oxygen in the oxide formula.
CatRecalc.frame <- data.frame(all.ox, c(2,2,1.5,1.5,1,1,1,1,0.5,0.5,2.5,2,3,1,1,1,1,0.5,0.5,0.5,2,1.5,1.5,2,1,2,0.5))
colnames(CatRecalc.frame) <- c("Oxide", "N")
#This is the numbers from CatRecalc.frame relevant to the raw data inputted
CatRecalc <- CatRecalc.frame[match(ox, CatRecalc.frame$Oxide), "N"]
#Calculate the number of anions on the basis of the desired number of oxygens
AnionsPerOxNum <- apply(AtPropOx, MARGIN = 1, function(x) x * OxRecalc)
#Calculate the number of cations in the desired formula
cations <- as.data.frame(t(apply(AnionsPerOxNum, MARGIN = 1, function(x) x / CatRecalc)))
#Save a clean copy
cats <- cations
#Change column names to cations only
colnames(cats) <- paste0(c("Si", "Al", "Ti", "Ca", "Na", "Fe", "Mg", "Mn", "Cr"), ".cpx")
#Round dataframe
cats <- as.data.frame(apply(cats, 2, round, digits = 3))
#Bind together the data
dat <- cbind(userdat, cats)
#Make a column to decide if the experiment should be removed or not
dat$Rm <- "N"
#Remove obviously bad experiments. These should never be added to the model because they are poor
dat$Rm[which(is.na(apply(cats, 1, sum)))] <- "Y"
dat$Rm[which(apply(cats, 1, sum) > 4.04 | apply(cats, 1, sum) < 3.96)] <- "Y"
#Normalise liquids to 100 wt% anhyrous
liq.cols <- grep(pattern = ".liq", x = colnames(dat))
dat[,liq.cols] <- round(as.data.frame(t(apply(dat[,liq.cols],1,function(x){(x/sum(x))*100}))),2)
#You can filter your data on the basis of Kd here
#(conditional on whether liquid was selected)
if(liq == "Liquid") {
upper.kd <- 0.393
lower.kd <- 0.159
dat$kd <- (dat$FeO.cpx/dat$MgO.cpx) / (dat$FeO.liq/dat$MgO.liq)
dat$Rm[which(is.na(dat$kd) == TRUE)] <- "Y"
dat$Rm[which(dat$kd > upper.kd | dat$kd < lower.kd)] <- "Y"
}
#Remove experiments and set final file for input
input.user <- dat[which(dat$Rm == "N"),]
#Save data
save(input.user, file = "YOURDATA.Rdata")
########################
#####Run the model######
########################
#Load your data in! The SEE for this is what you calculated in #4
#######################
#### INSTALL REQUIRED PACKAGES IF NECESSARY
options(java.parameters = "-Xmx4g")
pack1 <- suppressWarnings(require(PerformanceAnalytics))
if(pack1 == FALSE) {install.packages("PerformanceAnalytics");library(PerformanceAnalytics)}
pack2 <- suppressWarnings(require(rJava))
if(pack2 == FALSE) {install.packages("rJava");library(rJava)}
pack3 <- suppressWarnings(require(extraTrees))
if(pack3 == FALSE) {install.packages("extraTrees");library(extraTrees)}
pack4 <- suppressWarnings(require(EnvStats))
if(pack4 == FALSE) {install.packages("EnvStats");library(EnvStats)}
pack5 <- suppressWarnings(require(xlsx))
if(pack5 == FALSE) {install.packages("xlsx");library(xlsx)}
pack6 <- suppressWarnings(require(readxl))
if(pack6 == FALSE) {install.packages("readxl");library(readxl)}
pack7 <- suppressWarnings(require(rstudioapi))
if(pack7 == FALSE) {install.packages("rstudioapi");library(rstudioapi)}
rm(pack1,pack2,pack3,pack4,pack5,pack6, pack7)
setwd(paste(dirname(rstudioapi::getActiveDocumentContext()$path)))
####load models####
load("P_C.Rdata")
load("T_C.Rdata")
#### What do you have? ####
#SAME AS #4 or else the SEE you calculated is incorrect!
liq <- c("NoLiquid") #OR Liquid
#Load input data
load(file = "YOURDATA.Rdata")
YOURDATA <- input.user
####Load in your data####
#Filter your data using sheet 1 and 2 before this step.
INPUTDATA <- YOURDATA[,c("SiO2.cpx","TiO2.cpx","Al2O3.cpx","Cr2O3.cpx","FeO.cpx","MgO.cpx","MnO.cpx","CaO.cpx","Na2O.cpx")] #make sure the elements are the same and they have the same naming convention
if(liq== "Liquid"){INPUTDATA <- YOURDATA[,c("SiO2.cpx","TiO2.cpx","Al2O3.cpx","Cr2O3.cpx","FeO.cpx","MgO.cpx","MnO.cpx","CaO.cpx","Na2O.cpx","SiO2.liq", "TiO2.liq","Al2O3.liq", "FeO.liq","MgO.liq", "MnO.liq", "CaO.liq", "Na2O.liq","K2O.liq")]}
####Run the models ####
predP <- predict(P_C, newdata = INPUTDATA, allValues=TRUE) #this applies the model to your data
predT <- predict(T_C, newdata = INPUTDATA, allValues=TRUE) #this applies the model to your data
#Create dataframe to save results
OUTPUTDATA <- YOURDATA
#Calculations for pressure (mean, median, mode IQR)
P_mean <- round(apply(predP, 1, mean), digits = 1); OUTPUTDATA$P_mean <- P_mean
P_median <- round(apply(predP, 1, median), digits = 1); OUTPUTDATA$P_median <- P_median
P_tab <- apply(round(predP, digits = 1), 1, table)
P_mode <- unlist(lapply(P_tab, function(x) {as.numeric(names(sort(x, decreasing = T)[1]))})); OUTPUTDATA$P_mode <- P_mode
P_IQR <- round(apply(predP, 1, IQR), digits = 1); OUTPUTDATA$P_IQR <- P_IQR
#Calculations for temperature (mean, median, mode IQR)
T_mean <- round(apply(predT, 1, mean), digits = 0); OUTPUTDATA$T_mean <- T_mean
T_median <- round(apply(predT, 1, median), digits = 0); OUTPUTDATA$T_median <- T_median
T_tab <- apply(round(predT, digits = 0), 1, table)
T_mode <- unlist(lapply(T_tab, function(x) {as.numeric(names(sort(x, decreasing = T)[1]))})); OUTPUTDATA$T_mode <- T_mode
T_IQR <- round(apply(predT, 1, IQR), digits = 0); OUTPUTDATA$T_IQR <- T_IQR
#Create final output dataframe
write.csv(OUTPUTDATA,'OutputData.csv') #save it! :)
#P out outputs are in kbar, T outputs are in celsius
